In [1]:
import plotly.graph_objects as go
import pandas as pd

class SankeyData:
    """Accumulate Tazer I/O statistics and render them as a Plotly Sankey diagram.

    Each call to :meth:`load_stat` registers one statistics frame together with
    the task and the file it belongs to.  :meth:`build_links` converts every
    registered frame into a link between its task node and its file node, and
    :meth:`plot` draws the result.

    Attributes:
        df_stat: list of dicts (keys ``df``, ``filename``, ``type``, ``size``,
            ``task_name``), one per :meth:`load_stat` call.
        nodes: DataFrame with columns ``label`` and ``color``; a node's id is
            its row position.
        links: DataFrame with columns ``source``, ``target`` and ``value``;
            ``source`` and ``target`` hold node ids.
    """

    # Node colour per role ("none" is used for the optional "no read"/"no write" nodes).
    color_map = {"task": "red",
                 "file": "blue",
                 "none": "grey"
                }
    # A file's ``size`` is divided by this to obtain its total number of blocks.
    block_size = 1024

    # Column layout of the node and link tables.
    _NODE_COLUMNS = ('label', 'color')
    _LINK_COLUMNS = ('source', 'target', 'value')

    def __init__(self):
        # All mutable state lives on the instance.  Keeping it on the class
        # would make every SankeyData object share (and keep growing) the
        # same list of loaded statistics.
        self.reset()

    def load_stat(self, df_stat, params):
        """Register one statistics frame and make sure its nodes exist.

        Args:
            df_stat: DataFrame with a ``block_idx`` column (read by
                :meth:`build_links`).
            params: dict with the keys ``filename``, ``type`` (``"r"`` marks a
                read, anything else is treated as a write), ``size`` and
                ``task_name``.

        Raises:
            KeyError: if ``params`` lacks one of the required keys.
        """
        self.df_stat.append({"df": df_stat,
                             "filename": params['filename'],
                             "type": params['type'],
                             "size": params['size'],
                             "task_name": params['task_name']})

        # One node per task name ...
        self.set_taskname(params['task_name'])
        # ... and one node per file name, however often the file is loaded.
        self._ensure_node(params['filename'], self.color_map['file'])

    def set_taskname(self, name):
        """Add a task node labelled ``name`` unless one already exists."""
        self._ensure_node(name, self.color_map['task'])

    def get_node(self, idx):
        """Return the node row with id ``idx`` as a Series.

        A string is still treated as a column name (the only lookup the
        previous ``self.nodes[idx]`` implementation could satisfy).
        """
        if isinstance(idx, str):
            return self.nodes[idx]
        return self.nodes.iloc[idx]

    def add_node(self, node_dict):
        """Append a node and return its id (row position in ``self.nodes``).

        Args:
            node_dict: mapping with at least ``label`` and ``color``.
        """
        row = pd.DataFrame([node_dict])
        self.nodes = pd.concat([self.nodes, row], ignore_index=True)
        return len(self.nodes) - 1

    def get_link(self, name):
        """Return the column ``name`` of the link table."""
        return self.links[name]

    def set_links(self):
        """Placeholder; links are currently produced by :meth:`build_links`."""
        pass

    def build_links(self, no_rw=False):
        """Rebuild ``self.links`` from every statistics frame loaded so far.

        For each loaded entry one link is created whose value is the number of
        distinct ``block_idx`` values in its frame.  Reads flow file -> task,
        writes flow task -> file.

        Args:
            no_rw: when true, an additional link per entry carries the blocks
                that were *not* touched (``size / block_size`` minus the
                distinct block count, clamped at zero).  Reads use a shared
                ``"no read"`` source node, writes a shared ``"no write"``
                target node; both are created on demand.

        Raises:
            KeyError: if the task or file node of a loaded entry is missing
                from ``self.nodes``.
        """
        key = 'block_idx'
        records = []
        for entry in self.df_stat:
            touched = int(entry['df'][key].nunique())
            file_id = self._require_node(entry['filename'])
            task_id = self._require_node(entry['task_name'])
            is_read = entry['type'] == "r"

            if is_read:
                records.append({'source': file_id, 'target': task_id, 'value': touched})
            else:
                records.append({'source': task_id, 'target': file_id, 'value': touched})

            if no_rw:
                # The block count can exceed size / block_size (e.g. a partial
                # last block); a Sankey flow cannot be negative, so clamp.
                untouched = max(entry['size'] / self.block_size - touched, 0)
                if is_read:
                    none_id = self._ensure_node('no read', self.color_map['none'])
                    records.append({'source': none_id, 'target': task_id, 'value': untouched})
                else:
                    none_id = self._ensure_node('no write', self.color_map['none'])
                    records.append({'source': task_id, 'target': none_id, 'value': untouched})

        # Passing the columns keeps the table well-formed even with no records.
        self.links = pd.DataFrame(records, columns=list(self._LINK_COLUMNS))

    def plot(self):
        """Show the Sankey diagram for the current nodes and links."""
        node_spec = self.nodes[['label', 'color']].to_dict('list')
        link_spec = self.links.to_dict('list')
        fig = go.Figure(data=[go.Sankey(
            node = node_spec,
            link = link_spec)])
        fig.show()

    def reset(self):
        """Drop all loaded statistics, nodes and links."""
        self.df_stat = []
        self.nodes = pd.DataFrame(columns=list(self._NODE_COLUMNS))
        self.links = pd.DataFrame(columns=list(self._LINK_COLUMNS))

    def _find_node(self, label):
        """Return the id of the node labelled ``label``, or None if absent."""
        matches = self.nodes.index[self.nodes['label'] == label]
        return int(matches[0]) if len(matches) else None

    def _require_node(self, label):
        """Return the id of the node labelled ``label``; raise KeyError if absent."""
        node_id = self._find_node(label)
        if node_id is None:
            raise KeyError(f"no node labelled {label!r}")
        return node_id

    def _ensure_node(self, label, color):
        """Return the id of the node labelled ``label``, creating it if needed."""
        node_id = self._find_node(label)
        if node_id is None:
            node_id = self.add_node({'label': label, 'color': color})
        return node_id
In [2]:
import pandas as pd

# Load a Tazer stat file into a pandas DataFrame
def stat_to_df(fname):
    """Read one space-separated stat file into a DataFrame.

    The first line of the file is skipped and the remaining rows are
    labelled ``block_idx`` and ``frequency``.

    Args:
        fname: path of the stat file.

    Returns:
        DataFrame with the columns ``block_idx`` and ``frequency``.
    """
    column_names = ['block_idx', 'frequency']
    return pd.read_csv(fname, sep=' ', names=column_names, skiprows=1)
In [3]:
# Show the contents of each directory directly under tazer_stat/
!ls tazer_stat/*/
tazer_stat/aggregate.py/:
aggregated.h5_r_stat task0001             task0004
aggregated.h5_w_stat task0002             task0005
task0000             task0003

tazer_stat/sim_emulator.py/:
task0000 task0001 task0002 task0003 task0004 task0005
In [4]:
# One level deeper: the *_r_stat / *_w_stat files inside every task directory
!ls tazer_stat/*/*
tazer_stat/aggregate.py/aggregated.h5_r_stat
tazer_stat/aggregate.py/aggregated.h5_w_stat

tazer_stat/aggregate.py/task0000:
residue_100_output.h5_r_stat residue_100_output.h5_w_stat

tazer_stat/aggregate.py/task0001:
residue_100_output.h5_r_stat residue_100_output.h5_w_stat

tazer_stat/aggregate.py/task0002:
residue_100_output.h5_r_stat residue_100_output.h5_w_stat

tazer_stat/aggregate.py/task0003:
residue_100_output.h5_r_stat residue_100_output.h5_w_stat

tazer_stat/aggregate.py/task0004:
residue_100_output.h5_r_stat residue_100_output.h5_w_stat

tazer_stat/aggregate.py/task0005:
residue_100_output.h5_r_stat residue_100_output.h5_w_stat

tazer_stat/sim_emulator.py/task0000:
residue_100_output.h5_r_stat residue_100_output.h5_w_stat

tazer_stat/sim_emulator.py/task0001:
residue_100_output.h5_r_stat residue_100_output.h5_w_stat

tazer_stat/sim_emulator.py/task0002:
residue_100_output.h5_r_stat residue_100_output.h5_w_stat

tazer_stat/sim_emulator.py/task0003:
residue_100_output.h5_r_stat residue_100_output.h5_w_stat

tazer_stat/sim_emulator.py/task0004:
residue_100_output.h5_r_stat residue_100_output.h5_w_stat

tazer_stat/sim_emulator.py/task0005:
residue_100_output.h5_r_stat residue_100_output.h5_w_stat
In [5]:
task_cnt = 6

# Task directories are named task0000, task0001, ...  Formatting the id with
# :04d keeps the name correct for ids of 10 and above, where a literal
# "task000{i}" prefix would produce five digits.
task_dirs = [f'task{i:04d}' for i in range(task_cnt)]

# Per-task read (r) / write (w) stats of residue_100_output.h5, indexed by task id.
sim_df_r = [stat_to_df(f'tazer_stat/sim_emulator.py/{d}/residue_100_output.h5_r_stat')
            for d in task_dirs]
sim_df_w = [stat_to_df(f'tazer_stat/sim_emulator.py/{d}/residue_100_output.h5_w_stat')
            for d in task_dirs]

agg_df_r = [stat_to_df(f'tazer_stat/aggregate.py/{d}/residue_100_output.h5_r_stat')
            for d in task_dirs]
agg_df_w = [stat_to_df(f'tazer_stat/aggregate.py/{d}/residue_100_output.h5_w_stat')
            for d in task_dirs]

# Stats of aggregated.h5, stored directly under the aggregate.py directory.
agg_last_df_r = stat_to_df('tazer_stat/aggregate.py/aggregated.h5_r_stat')
agg_last_df_w = stat_to_df('tazer_stat/aggregate.py/aggregated.h5_w_stat')
In [6]:
# Register the per-task stats with a fresh SankeyData.
# The call order (sim r, sim w, agg r, agg w for each task id) is kept as is,
# because nodes are appended in the order their labels are first seen.
residue_size = 10727328  # value passed as 'size' — presumably bytes, TODO confirm

sd = SankeyData()
for i in range(task_cnt):
    residue_label = f'residue_100_output.h5 ({i})'
    per_task = (
        ('sim_emulator.py', 'r', sim_df_r[i]),
        ('sim_emulator.py', 'w', sim_df_w[i]),
        ('aggregate.py', 'r', agg_df_r[i]),
        ('aggregate.py', 'w', agg_df_w[i]),
    )
    for task_name, io_type, stat_df in per_task:
        sd.load_stat(stat_df, {'filename': residue_label,
                               'type': io_type,
                               'size': residue_size,
                               'task_name': task_name})
In [7]:
# Register the read and write stats of aggregated.h5 for aggregate.py.
for io_type, stat_df in (('r', agg_last_df_r), ('w', agg_last_df_w)):
    sd.load_stat(stat_df, {'filename': 'aggregated.h5',
                           'type': io_type,
                           'size': 63502078,
                           'task_name': 'aggregate.py'})
In [8]:
# Turn every loaded stat into a link; the result is stored in sd.links
sd.build_links()
In [9]:
# Node table: one row per task / file label with its colour; the row index is the node id
sd.nodes
Out[9]:
label color
0 sim_emulator.py red
1 residue_100_output.h5 (0) blue
2 aggregate.py red
3 residue_100_output.h5 (1) blue
4 residue_100_output.h5 (2) blue
5 residue_100_output.h5 (3) blue
6 residue_100_output.h5 (4) blue
7 residue_100_output.h5 (5) blue
8 aggregated.h5 blue
In [10]:
# Link table: source/target are node ids from sd.nodes, value is the number of distinct block_idx values
sd.links
Out[10]:
source target value
0 1 0 5
1 0 1 10479
2 1 2 10479
3 2 1 73
4 3 0 5
5 0 3 10488
6 3 2 10488
7 2 3 73
8 4 0 5
9 0 4 10485
10 4 2 10485
11 2 4 73
12 5 0 5
13 0 5 10459
14 5 2 10459
15 2 5 73
16 6 0 5
17 0 6 10477
18 6 2 10477
19 2 6 73
20 7 0 5
21 0 7 10446
22 7 2 10446
23 2 7 73
24 8 2 2
25 2 8 62015
In [11]:
# Draw the Sankey diagram from sd.nodes and sd.links
sd.plot()